Assemble all files and meta data

Import cluster meta data

Should be a total of 121 files named: Cluster{a}.{b}.RA{c}.csv where:

  1. cluster number
  2. section number
  3. patient number
# Import cluster data from directory with cluster csv files
# for example: Cluster1.1.RA1.csv, Cluster1.2.RA1.csv, ...
cluster.files <- grep("Cluster", list.files("../../data/", full.names = TRUE), value = TRUE)
d <- do.call(rbind, strsplit(do.call(rbind, strsplit(cluster.files, "/"))[, 5], "\\."))
d[, 1] <- substr(d[, 1], start = 8, stop = 8)
d <- data.frame(cluster = as.numeric(d[, 1]), 
                section = as.numeric(d[, 2]),
                patient = d[, 3],
                id = paste0(d[, 3], "x", d[, 2]))
d$path <- cluster.files

# Display data for html report
DT::datatable(head(d))

Import expression meta data

Should be a total of 27 files named: RA{a}Raw.exp{b}.csv where:

  1. patient number
  2. section number
# Import expression matrices
expr.files <- grep("Raw.exp", list.files("../../data", full.names = TRUE), value = TRUE)
dexpr <- do.call(rbind, strsplit(do.call(rbind, strsplit(expr.files, "/"))[, 4], "_"))
dexpr <- data.frame(patient = dexpr[, 1], section = as.numeric(substr(dexpr[, 3], 1, 1)))
dexpr$id <- paste0(dexpr$patient, "x", dexpr$section)
dexpr$path <- expr.files
conv <- setNames(dexpr$path, nm = dexpr$id)

# Merge meta data
d$expr_path <- conv[d$id]
d <- na.omit(d)

# Display data for html report
DT::datatable(head(d))

Import and merge expression data

# Iterate through each patient
res <- lapply(unique(d$patient), function(patient_id) {
  
  print(patient_id)
  d.subset.tmp <- subset(d, patient %in% patient_id)
  
  # Iterate through each section
  tst <- lapply(unique(d.subset.tmp$section), function(section_id) {
    
    print(section_id)
    
    # Subset meta data to include correct patient and section number
    d.subset <- subset(d, patient %in% patient_id & section %in% section_id)
    
    # Iterate through each cluster and collect spatial coordinates
    coords <- do.call(rbind, lapply(1:nrow(d.subset), function(i) {
      df <- read.table(d.subset$path[i], sep = ",", header = TRUE)
      df <- df[, 1:3]
      df$cluster <- d.subset$cluster[i]
      return(df)
    }))
    coords$ID <- paste0(coords$x, "_", coords$y)
    coords <- coords[!duplicated(coords$ID), ]
    rownames(coords) <- paste0(coords$x, "_", coords$y)
    #exprMat <- read.table(unique(d.subset$expr_path), sep = ",", header = TRUE)
    
    # Read expression data
    exprMat <- read.table(unique(d.subset$expr_path), sep = ",", header = TRUE, row.names = 1)
    colnames(exprMat) <- gsub(pattern = "^X", replacement = "", x = colnames(exprMat))
    
    # Define intersecting spots
    spots.keep <- intersect(rownames(coords), colnames(exprMat))
    
    # Subset expression matrix and coords
    exprMat <- exprMat[, spots.keep]
    coords <- coords[spots.keep, ]
    combined <- cbind(coords, t(exprMat))
    rownames(combined) <- paste0(combined$Image_ID, "x", combined$ID)
    return(combined)
  })
  
  # Make sure that only intersecting genes are kept
  intersecting.genes <- Reduce(intersect, lapply(tst, colnames))
  tst <- do.call(rbind, lapply(tst, function(x) x[, intersecting.genes]))
  
  return(tst)
})
## [1] "RA1"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] "RA2"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] "RA3"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] "RA4"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] "RA5"
## [1] 1
## [1] 2
## [1] 3
## [1] "RA6"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
# Again, make sure that only intersecting genes are kept but this time
# across the merged patient datasets
intersecting.columns <- Reduce(intersect, lapply(res, colnames))

# Merge patient datasets (10879 columns and 16684 rows)
res.merged <- do.call(rbind, lapply(res, function(x) x[, intersecting.columns]))

# Display data for html report
DT::datatable(res.merged[1:5, 1:10])

Check cluster distributions across all datasets

library(ggplot2)

p <- ggplot(res.merged, aes(x, y, color = factor(cluster))) + 
  geom_point(size = 0.5) +
  facet_wrap(~Image_ID) + 
  theme_void() +
  labs(color = "cluster")
p

png(filename = "../all_clusters/cluster_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png 
##   2

Create metadata for cellphonedb

library(data.table)

# One column with cell name (proxy for "spot") and one column with cluster id
metadata <- data.frame(cell = rownames(res.merged), cluster = res.merged$cluster)

# Add column for cluster 1 vs background comparison
# All clusters except 1 are treated as "background"
metadata$aggr_cluster <- ifelse(metadata$cluster == 1, "cluster_1", "background")

# Add patient id 
patient <- substr(metadata$cell, 1, 3)

# Add column for seropositive vs seronegative comparison
metadata$cluster_1 <- paste0(patient, "_", metadata$cluster)
metadata$cluster_1 <- ifelse(metadata$cluster_1 %in% c("RA1_1", "RA2_1", "RA3_1"), "cluster_1_seropositive", metadata$cluster_1)
metadata$cluster_1 <- ifelse(metadata$cluster_1 %in% c("RA4_1", "RA5_1", "RA6_1"), "cluster_1_seronegative", metadata$cluster_1)

# Display data for html report
DT::datatable(head(metadata))

Subset res.merged to include expression data only and transpose for cellphonedb. Add a column named “Gene” to keep gene symbols in.

exprMat <- t(res.merged[, 6:ncol(res.merged)])
exprMat <- data.frame(Gene = rownames(exprMat), exprMat)

Export metadata and expression data for cluster 1 vs background comparison in seropositive patients; RA1, RA2 and RA3

# Export data for sections 1 to 3, cluster 1 vs all
mdata.RA1to3_cluster_1_vs_all <- setNames(metadata[patient %in% c("RA1", "RA2", "RA3"), c("cell", "aggr_cluster")], nm = c("cell", "cluster"))
write.table(x = mdata.RA1to3_cluster_1_vs_all, file = "../../cellphonedb/input/meta_RA1to3_cluster_1_vs_all.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(patient %in% c("RA1", "RA2", "RA3")) + 1)], file = "../../cellphonedb/input/counts_RA1to3_cluster_1_vs_all.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.RA1to3_cluster_1_vs_all <- cbind(mdata.RA1to3_cluster_1_vs_all, res.merged[mdata.RA1to3_cluster_1_vs_all$cell, c("Image_ID", "x", "y")])

# Plot selections for sanity check
p <- ggplot(mdata.RA1to3_cluster_1_vs_all, aes(x, y, color = factor(cluster))) + 
  geom_point(size = 0.5) +
  facet_wrap(~Image_ID) + 
  theme_void() +
  labs(color = "cluster")
p

png(filename = "../all_clusters/cluster_1_vs_all_RA1to3_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png 
##   2

Export metadata and expression data for cluster 1 vs background comparison in seronegative patients; RA4, RA5 and RA6

# Export data for sections 4 to 6, cluster 1 vs all
mdata.RA4to6_cluster_1_vs_all <- setNames(metadata[patient %in% c("RA4", "RA5", "RA6"), c("cell", "aggr_cluster")], nm = c("cell", "cluster"))
write.table(x = mdata.RA4to6_cluster_1_vs_all, file = "../../cellphonedb/input/meta_RA4to6_cluster_1_vs_all.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(patient %in% c("RA4", "RA5", "RA6")) + 1)], file = "../../cellphonedb/input/counts_RA4to6_cluster_1_vs_all.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.RA4to6_cluster_1_vs_all <- cbind(mdata.RA4to6_cluster_1_vs_all, res.merged[mdata.RA4to6_cluster_1_vs_all$cell, c("Image_ID", "x", "y")])

# Plot selections for sanity check
p <- ggplot(mdata.RA4to6_cluster_1_vs_all, aes(x, y, color = factor(cluster))) + 
  geom_point(size = 0.5) +
  facet_wrap(~Image_ID) + 
  theme_void() +
  labs(color = "cluster")
p

png(filename = "../all_clusters/cluster_1_vs_all_RA4to6_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png 
##   2

Export metadata and expression data for cluster 1 (seropositive) vs cluster 1 (seronegative) comparison. All patient datasets included.

# Export data for cluster 1 seropositive vs cluster 1 seronegative
mdata.seropositive_vs_seronegative <- setNames(metadata[metadata$cluster_1 %in% c("cluster_1_seropositive", "cluster_1_seronegative"), c("cell", "cluster_1")], nm = c("cell", "cluster"))
write.table(x = mdata.seropositive_vs_seronegative, 
            file = "../../cellphonedb/input/meta_seropositive_vs_seronegative.txt", 
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(metadata$cluster_1 %in% c("cluster_1_seropositive", "cluster_1_seronegative")) + 1)], 
       file = "../../cellphonedb/input/counts_seropositive_vs_seronegative.txt", 
       sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.seropositive_vs_seronegative <- cbind(mdata.seropositive_vs_seronegative, res.merged[mdata.seropositive_vs_seronegative$cell, c("Image_ID", "x", "y")])

# Plot selections for sanity check
p <- ggplot(mdata.seropositive_vs_seronegative, aes(x, y, color = factor(cluster))) + 
  geom_point(size = 0.5) +
  facet_wrap(~Image_ID) + 
  theme_void() +
  labs(color = "cluster")
p

png(filename = "../all_clusters/cluster_1_seropositive_vs_seronegative.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png 
##   2

Date

date()
## [1] "Wed Sep 15 15:48:47 2021"

Session info

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.3.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] data.table_1.14.0 ggplot2_3.3.3    
## 
## loaded via a namespace (and not attached):
##  [1] bslib_0.2.4       compiler_4.0.4    pillar_1.5.0      jquerylib_0.1.3  
##  [5] tools_4.0.4       digest_0.6.27     jsonlite_1.7.2    evaluate_0.14    
##  [9] lifecycle_1.0.0   tibble_3.0.6      gtable_0.3.0      pkgconfig_2.0.3  
## [13] rlang_0.4.10      DBI_1.1.1         crosstalk_1.1.1   yaml_2.2.1       
## [17] xfun_0.20         withr_2.4.1       stringr_1.4.0     dplyr_1.0.4      
## [21] knitr_1.30        generics_0.1.0    htmlwidgets_1.5.3 sass_0.3.1       
## [25] vctrs_0.3.6       grid_4.0.4        DT_0.17           tidyselect_1.1.0 
## [29] glue_1.4.2        R6_2.5.0          fansi_0.4.2       rmarkdown_2.7    
## [33] farver_2.0.3      purrr_0.3.4       magrittr_2.0.1    scales_1.1.1     
## [37] htmltools_0.5.1.1 ellipsis_0.3.1    assertthat_0.2.1  colorspace_2.0-0 
## [41] labeling_0.4.2    utf8_1.1.4        stringi_1.5.3     munsell_0.5.0    
## [45] crayon_1.4.1